import os
NOVA_HOME = '/home/projects/hornsteinlab/Collaboration/NOVA'
NOVA_DATA_HOME = '/home/projects/hornsteinlab/Collaboration/MOmaps'
LOGS_PATH = os.path.join(NOVA_HOME, "outputs/preprocessing/ManuscriptFinalData_80pct/neuronsDay18/logs/")
PLOT_PATH = os.path.join(NOVA_HOME, "outputs/preprocessing/ManuscriptFinalData_80pct/neuronsDay18/QC_plots")
os.chdir(NOVA_HOME)
import pandas as pd
import contextlib
import io
from IPython.display import display, Javascript
from tools.preprocessing_tools.qc_reports.qc_utils import log_files_qc, run_validate_folder_structure, display_diff, sample_and_calc_variance, \
show_site_survival_dapi_brenner, show_site_survival_dapi_cellpose, \
show_site_survival_dapi_tiling, show_site_survival_target_brenner, \
calc_total_sums, plot_filtering_heatmap, show_total_sum_tables, \
plot_cell_count, plot_catplot, plot_hm_combine_batches, plot_hm, \
run_calc_hist_new
from tools.preprocessing_tools.qc_reports.qc_config import opera18days_panels, opera18days_markers, opera18days_marker_info, \
opera18days_cell_lines, opera18days_cell_lines_to_cond,\
opera18days_cell_lines_for_disp, opera18days_reps, \
opera18days_line_colors, opera18days_lines_order, \
opera18days_custom_palette, opera18days_expected_dapi_raw, \
markers
%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use: %reload_ext autoreload
df = log_files_qc(LOGS_PATH,only_wt_cond=False, filename_split='-',site_location=0)
df_dapi = df[df.marker=='DAPI']
df_target = df[df.marker!='DAPI']
reading logs of batch2 reading logs of batch1 Total of 2 files were read. Before dup handeling (137224, 21) After duplication removal #1: (137224, 22) After duplication removal #2: (137224, 22)
# choose batches
batches = ['batch1', 'batch2']
batches
['batch1', 'batch2']
root_directory_raw = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'raw', 'Opera18DaysReimaged_sorted')
batches_raw = [batch.replace("_16bit_no_downsample","") for batch in batches]
raws = run_validate_folder_structure(root_directory_raw, False, opera18days_panels, opera18days_markers.copy(),PLOT_PATH, opera18days_marker_info,
opera18days_cell_lines_to_cond, opera18days_reps, opera18days_cell_lines_for_disp, opera18days_expected_dapi_raw,
batches=batches_raw, fig_height=12)
batch1 Folder structure is valid. No bad files are found. Total Sites: 82000
======== batch2 Folder structure is invalid. Missing 2 paths: /home/projects/hornsteinlab/Collaboration/MOmaps/input/images/raw/Opera18DaysReimaged_sorted/batch2/FUSHomozygous/panelD/stress/rep1/CLTC /home/projects/hornsteinlab/Collaboration/MOmaps/input/images/raw/Opera18DaysReimaged_sorted/batch2/FUSHomozygous/panelD/stress/rep2/CLTC No bad files are found. Total Sites: 81800
======== ====================
root_directory_proc = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'processed', 'ManuscriptFinalData_80pct', 'neuronsDay18')
procs = run_validate_folder_structure(root_directory_proc, True, opera18days_panels, opera18days_markers,PLOT_PATH,opera18days_marker_info,
opera18days_cell_lines_to_cond, opera18days_reps, opera18days_cell_lines_for_disp, opera18days_expected_dapi_raw,
batches=batches, fig_height=12)
batch1 Folder structure is valid. No bad files are found. Total Sites: 65339
======== batch2 Folder structure is invalid. Missing 1 paths: /home/projects/hornsteinlab/Collaboration/MOmaps/input/images/processed/ManuscriptFinalData_80pct/neuronsDay18/batch2/FUSHomozygous/stress/CLTC No bad files are found. Total Sites: 63365
======== ====================
display_diff(batches, raws, procs, PLOT_PATH,fig_height=12)
batch1
======== batch2
========
#for batch in list(range(3,9)) + ['7_16bit','8_16bit','9_16bit']:
for batch in batches:
with contextlib.redirect_stdout(io.StringIO()):
var = sample_and_calc_variance(root_directory_proc, batch,
sample_size_per_markers=200, cond_count=2, rep_count=len(opera18days_reps),
num_markers=len(opera18days_markers))
print(f'{batch} var: ',var)
batch1 var: 0.02297973057841009 batch2 var: 0.022649935159345384
By order of filtering
Percentage out of the total sites
dapi_filter_by_brenner = show_site_survival_dapi_brenner(df_dapi,batches, opera18days_line_colors, opera18days_panels,
figsize=(10,6), reps = opera18days_reps)
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.
A site will be filtered out if Cellpose found 0 cells in it.
dapi_filter_by_cellpose = show_site_survival_dapi_cellpose(df_dapi, batches, dapi_filter_by_brenner,
opera18days_line_colors, opera18days_panels, reps = opera18days_reps,
figsize=(10,6))
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.
A site will be filtered out if after tiling, no tile is containing at least one whole cell that Cellpose detected.
dapi_filter_by_tiling=show_site_survival_dapi_tiling(df_dapi, batches, dapi_filter_by_cellpose,
opera18days_line_colors, opera18days_panels, figsize=(10,6),
reps = opera18days_reps)
Percentage out of the sites that passed the previous filter. In parenthesis are absolute values (if different than the percentages).
show_site_survival_target_brenner(df_dapi, df_target, dapi_filter_by_tiling,
figsize=(10,10), markers=opera18days_markers)
names = ['Total number of tiles', 'Total number of whole cells']
stats = ['n_valid_tiles','site_whole_cells_counts_sum','site_cell_count','site_cell_count_sum']
total_sum = calc_total_sums(df_target, df_dapi, stats, opera18days_markers)
## Are we using FMRP?
markers_for_d18 = markers.copy()
markers_for_d18.remove('TIA1')
total_sum[total_sum.marker.isin(markers_for_d18)].n_valid_tiles.sum()
442372
total_sum[total_sum.marker =='DAPI'].site_whole_cells_counts_sum.sum()
80745.0
total_sum[total_sum.marker =='DAPI'].site_cell_count.sum()
288327.0
show_total_sum_tables(total_sum)
| n_valid_tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| batch1 | ||||
| count | 820.000000 | 820.000000 | 820.000000 | 820.000000 |
| mean | 329.431707 | 3.294317 | 188.884146 | 634.993902 |
| std | 171.621788 | 1.716218 | 197.492387 | 467.847871 |
| min | 26.000000 | 0.260000 | 15.000000 | 39.000000 |
| 25% | 221.000000 | 2.210000 | 100.750000 | 391.000000 |
| 50% | 327.000000 | 3.270000 | 159.000000 | 607.000000 |
| 75% | 441.000000 | 4.410000 | 221.000000 | 846.000000 |
| max | 1396.000000 | 13.960000 | 2375.000000 | 5331.000000 |
| sum | 270134.000000 | NaN | 154885.000000 | 520695.000000 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 450.000000 |
| n_valid_tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| batch2 | ||||
| count | 818.000000 | 818.000000 | 818.000000 | 818.000000 |
| mean | 295.196822 | 2.951968 | 148.358191 | 548.536675 |
| std | 153.066434 | 1.530664 | 97.099994 | 323.790277 |
| min | 7.000000 | 0.070000 | 8.000000 | 8.000000 |
| 25% | 179.000000 | 1.790000 | 84.000000 | 317.000000 |
| 50% | 268.000000 | 2.680000 | 128.000000 | 508.000000 |
| 75% | 404.750000 | 4.047500 | 197.000000 | 768.250000 |
| max | 1005.000000 | 10.050000 | 951.000000 | 2801.000000 |
| sum | 241471.000000 | NaN | 121357.000000 | 448703.000000 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 450.000000 |
| n valid tiles | % valid tiles | site_whole_cells_counts_sum | site_cell_count | |
|---|---|---|---|---|
| All batches | ||||
| count | 1638.000000 | 1638.000000 | 1638.000000 | 1638.000000 |
| mean | 312.335165 | 3.123352 | 168.645910 | 591.818071 |
| std | 163.469872 | 1.634699 | 156.939281 | 404.600454 |
| min | 7.000000 | 0.070000 | 8.000000 | 8.000000 |
| 25% | 189.500000 | 1.895000 | 91.000000 | 343.000000 |
| 50% | 306.000000 | 3.060000 | 143.000000 | 559.000000 |
| 75% | 429.750000 | 4.297500 | 209.000000 | 793.500000 |
| max | 1396.000000 | 13.960000 | 2375.000000 | 5331.000000 |
| sum | 511605.000000 | NaN | 276242.000000 | 969398.000000 |
| expected_count | 450.000000 | 450.000000 | 450.000000 | 450.000000 |
For each batch, cell line, replicate and markerTotal number of tiles
df_no_empty_sites = df_dapi[df_dapi.n_valid_tiles !=0]
plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_cell_count_sum',
title='Cell Count Average per Site (from tiles)')
plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_whole_cells_counts_sum',
title='Whole Cell Count Average per Site')
plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_cell_count',
title='Cellpose Cell Count Average per Site')
df_no_empty_sites = df_dapi[df_dapi.n_valid_tiles !=0]
plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_cell_count_sum',
title='Cell Count Average per Site (from tiles)')
plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_whole_cells_counts_sum',
title='Whole Cell Count Average per Site')
plot_cell_count(df_no_empty_sites, opera18days_lines_order, opera18days_custom_palette, y='site_cell_count',
title='Cellpose Cell Count Average per Site')
df_dapi.groupby(['cell_line_cond']).n_valid_tiles.mean()
cell_line_cond FUSHeterozygous Untreated 3.686806 FUSHomozygous Untreated 3.997165 FUSHomozygous stress 3.198085 FUSRevertant Untreated 2.290094 OPTN Untreated 3.227312 SNCA Untreated 2.434939 TBK1 Untreated 2.604735 TDP43 Untreated 2.848064 WT Untreated 4.617641 WT stress 3.527682 Name: n_valid_tiles, dtype: float64
df_dapi[['site_cell_count']].mean()
site_cell_count 6.202715 dtype: float64
plot_catplot(df_dapi, opera18days_custom_palette, opera18days_reps,
x='n_valid_tiles', x_title='valid tiles count', batch_min=1, batch_max=2)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1017: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df.loc[:, 'batch_rep'] = df['batch'] + " " + df['rep']
plot_hm(df_dapi, split_by='rep', rows='cell_line', columns='panel')
for batch in batches:
print(batch)
run_calc_hist_new(f'{batch}',opera18days_cell_lines_for_disp, opera18days_markers,
root_directory_raw, root_directory_proc, hist_sample=10,
sample_size_per_markers=200, ncols=7, nrows=5)
print("="*30)
# save notebook as HTML ( the HTML will be saved in the same folder the original script is)
from IPython.display import display, Javascript
display(Javascript('IPython.notebook.save_checkpoint();'))
os.system(f'jupyter nbconvert --to html tools/preprocessing_tools/qc_reports/qc_report_d18_Opera_80pct.ipynb --output {NOVA_HOME}/manuscript/preprocessing_qc_reports/ManuscriptFinalData_80pct/neuronsDay18.html')
Error executing Jupyter command 'nbconvert': [Errno 2] No such file or directory
256